Reading in D.E. Andreev Data

First, we read in the D.E. Andreev supplemental data, and filter out all of the genes in which tsl is significantly affected by stress.

Then, we partition that dataset into two; genes that have tsl downregulated by stress, and genes that have tsl upregulated by stress.

Finally, we write the names of the upregulated and downregulated genes into separate files, to be parsed by read_fasta.py in conjunction with the Ensembl CDS dataset.

# Reads in D.E. Andreev data, and filters for the 10% FDR threshold for riboseq
# being True
library(readr)
library(dplyr)
library(ggplot2)

df <- read_csv("./data/hsapiens/riboseq_mRNA_deandreev.csv")
## Warning: Duplicated column names deduplicated: 'exceeds 10% FDR threshold'
## => 'exceeds 10% FDR threshold_1' [7], 'exceeds 10% FDR threshold' =>
## 'exceeds 10% FDR threshold_2' [10]
df_tsl_resp <- filter(df, `exceeds 10% FDR threshold` == "True")
df_tsl_resp$`Gene symbol` <- toupper(df_tsl_resp$`Gene symbol`)
df_tsl_resp_up <- filter(df_tsl_resp, `average ribo-seq Z-score` > 0)
df_tsl_resp_down <- filter(df_tsl_resp, `average ribo-seq Z-score` < 0)

df_tsl_resp_control <- filter(df, `exceeds 10% FDR threshold` == "False")
df_tsl_resp_control$`Gene symbol` <- toupper(df_tsl_resp_control$`Gene symbol`)


qplot(df_tsl_resp$`average ribo-seq Z-score`)

qplot(df_tsl_resp_control$`average ribo-seq Z-score`)
## Warning: Removed 2401 rows containing non-finite values (stat_bin).

# Filters to just deal with ribo-seq based data
df_tsl_resp <- df_tsl_resp[,1:4]
glimpse(df_tsl_resp)
## Observations: 3,236
## Variables: 4
## $ `Gene symbol`                          <chr> "A3GALT2", "AARS2", "AA...
## $ `average ribo-seq fold change (acORF)` <dbl> 0.695, 0.780, 0.665, 1....
## $ `average ribo-seq Z-score`             <dbl> -1.680, -0.980, -0.980,...
## $ `exceeds 10% FDR threshold`            <chr> "True", "True", "True",...
# Writes resulting filtered datasets to csv 
write_csv(select(df_tsl_resp_up, `Gene symbol`) ,"./data/hsapiens/andreev_genes_up.csv")
write_csv(select(df_tsl_resp_down, `Gene symbol`) ,"./data/hsapiens/andreev_genes_down.csv")
write_csv(select(df_tsl_resp_control, `Gene symbol`) ,"./data/hsapiens/andreev_genes_control.csv")

Plotting the number of gene repetitions in Ensembl CDS Data

As you can see, the number of repetitions doesn’t seem much affected by the up/downregulation.

df_ensembl_hits_up <- read_csv("./data/hsapiens/runs/hits_up.csv")
df_ensembl_hits_down <- read_csv("./data/hsapiens/runs/hits_down.csv")
df_ensembl_hits_control <- read_csv("./data/hsapiens/runs/hits_control.csv")

ggplot(df_ensembl_hits_up, aes(x=df_ensembl_hits_up)) + geom_histogram(bins = 30, color="black", fill="blue") +
  labs(title="Number of  gene repetitions in Ensembl CDS data",
         subtitle="Exceeds FDR 10% threshold for ribo-seq (upregulated)",
         x = "Number of repetitions",
         y = "Number of genes")

ggplot(df_ensembl_hits_down, aes(x=df_ensembl_hits_down)) + geom_histogram(bins = 30, color="black", fill="red") +
  labs(title="Number of  gene repetitions in Ensembl CDS data",
         subtitle="Exceeds FDR 10% threshold for ribo-seq (downregulated)",
         x = "Number of repetitions",
         y = "Number of genes")

ggplot(df_ensembl_hits_control, aes(x=df_ensembl_hits_control)) + geom_histogram(bins = 30, color="black", fill="green") +
  labs(title="Number of  gene repetitions in Ensembl CDS data",
         subtitle="Does not exceed FDR 10% threshold for ribo-seq",
         x = "Number of repetitions",
         y = "Number of genes")

Further filtering the FASTA candidate genes

Recall that read_fasta.py filters genes out from the entire Ensembl human CDS library, and produced two csv files, for both the upregulated and downregulated D.E. Andreev genes.

Here, we further filter those candidate records, ensuring that the coding sequences associated with the genes start with ATG, end with a valid stop codon, and have a length greater than 35. Then, we produce two datasets with one randomly selected copy for each candidate gene a given set, based on this filtering.

# Helper function for taking the right substring in R (why isn't this built in...)
substrRight <- function(x, n){
  substr(x, nchar(x)-n+1, nchar(x))
}

# Reads in the dataset generated by read_fasta.py
andreev_fasta_df_up <- read_csv("./data/hsapiens/fasta_andreev_genes_up.csv", col_names=F)
andreev_fasta_df_down <- read_csv("./data/hsapiens/fasta_andreev_genes_down.csv", col_names=F)
andreev_fasta_df_control <- read_csv("./data/hsapiens/fasta_andreev_genes_control.csv", col_names=F)

filterGenes <- function(andreev_fasta_df) {
  andreev_df <- andreev_fasta_df %>%
    # Ensures that the sequence starts with ATG and ends with a valid stop codon
    filter(substr(X2, 1, 3) == "ATG") %>%
    filter(substrRight(X2, 3) %in% c("TAG", "TAA", "TGA")) %>%
    filter(nchar(X2) > 35) %>%
    # Randomly samples one member of each represented gene
    group_by(X3) %>%
    sample_n(size = 1) %>%
    mutate(frame=substr(X2, 1, 35))
  colnames(andreev_df) <- c("FASTAHeader", "Sequence", "geneSymbol", "Frame")
  
  return(andreev_df)
}

set.seed(1)
andreev_df_up <- c()
andreev_df_down <- c()
andreev_df_control <- c()

for (i in 1:5) {
  andreev_df_up[[i]] <- filterGenes(andreev_fasta_df_up)
  andreev_df_down[[i]] <- filterGenes(andreev_fasta_df_down)
  andreev_df_control[[i]] <- filterGenes(andreev_fasta_df_control)
}

Writing the fasta files

Here, we write the FASTA files associated with the upregulated and downregulated D.E. Andreev genes, ready for use with the IGS Information Theoretic Alignment Analysis Tool.

Recall that, based on the runs in read_fasta.py, the nucleotide frequency in humans is

print(paste("A", 0.25961334304060124))
## [1] "A 0.259613343040601"
print(paste("T", 0.21846022530138462))
## [1] "T 0.218460225301385"
print(paste("G", 0.2627307219587737))
## [1] "G 0.262730721958774"
print(paste("C", 0.25919570969924044))
## [1] "C 0.25919570969924"

Writing out the filtered fasta files

Here, we’re outputting our three sets of filtered FASTA dataframes, for 5 samplings, to disk, to be used with the Information Theory IGS web tool.

for (i in 1:5) {
  andreev_df_up[[i]]$FASTAHeader <- trimws(paste(">", andreev_df_up[[i]]$FASTAHeader, sep=""))
  andreev_df_down[[i]]$FASTAHeader <- trimws(paste(">", andreev_df_down[[i]]$FASTAHeader, sep=""))
  andreev_df_control[[i]]$FASTAHeader <- trimws(paste(">", andreev_df_control[[i]]$FASTAHeader, sep=""))
  
  randomRows = function(df,n){
     return(df[sample(nrow(df),n),])
  }
  
  andreev_df_control[[i]] <- randomRows(andreev_df_control[[i]], 5000)
  
  fasta_lines_up = c(rbind(andreev_df_up[[i]]$FASTAHeader, andreev_df_up[[i]]$Frame))
  fasta_lines_down = c(rbind(andreev_df_down[[i]]$FASTAHeader, andreev_df_down[[i]]$Frame))
  fasta_lines_control = c(rbind(andreev_df_control[[i]]$FASTAHeader, andreev_df_control[[i]]$Frame))
  
  up_path <- paste("./data/hsapiens/gene_samplings_fasta/", i, 
                   "/andreev_genes_up_", i, ".fa", sep="")
  down_path <- paste("./data/hsapiens/gene_samplings_fasta/", i, 
                   "/andreev_genes_down_", i, ".fa", sep="" )
  control_path <- paste("./data/hsapiens/gene_samplings_fasta/", i, 
                   "/andreev_genes_control_", i, ".fa", sep="" )
  
  write_lines(fasta_lines_up, path=up_path)
  write_lines(fasta_lines_down, path=down_path)
  write_lines(fasta_lines_control, path=control_path)
}

Reading in Results of Information Theory Analysis

Now, we’re taking the csv representation of the results of said analysis, and producing three sets of data (corresponding to the upregulated genes, downregulated genes and control) for each of the five samplings.

infotheory_up <- c()
infotheory_down <- c()
infotheory_control <- c()


for (i in 1:5) {
  infotheory_up[[i]] <- read_csv(
      paste("./data/hsapiens/gene_samplings_fasta/", i, 
            "/infotheory_up_", i, ".csv", sep="")) %>%
      filter(pos > 3)
  infotheory_down[[i]] <- read_csv(
      paste("./data/hsapiens/gene_samplings_fasta/", i, 
            "/infotheory_down_", i, ".csv", sep="")) %>%
    filter(pos > 3)
  infotheory_control[[i]] <- read_csv(
      paste("./data/hsapiens/gene_samplings_fasta/", i, 
            "/infotheory_control_", i, ".csv", sep="")) %>%
    filter(pos > 3)
}

And now, to graph them all juxtaposed to each other, showcasing that the patterns shown seem conserved across the board.

getWeightsOfNucleotide <- function(graph_data_up, graph_data_down, graph_data_control, nt) {
  graph_data_combined_up <- graph_data_up %>%
    filter(nucleotide ==nt) %>%
    mutate(label="up")
  
  graph_data_combined_down <- graph_data_down %>%
    filter(nucleotide ==nt) %>%
    mutate(label="down")
  
  graph_data_combined_control <- graph_data_control %>%
    filter(nucleotide ==nt) %>%
    mutate(label="control")
  
  graph_data_combined <- rbind(graph_data_combined_up, 
                               graph_data_combined_down,
                               graph_data_combined_control)
  
  return(graph_data_combined)
  
}

plotResult <- function(graph_data_combined, nt, postfix) {
  ggplot(graph_data_combined, aes(x=pos, y=relativeWeight, color=label)) +
           geom_line() + scale_x_continuous(breaks = seq(5, 35, 3)) + labs(
      x= "Position in CDS",
      y= "Relative Weight (by InfoTheory)",
      title=paste(nt, "Frequency along ORF"),
      subtitle=paste("D.E. Andreev filtered genes (up + down regulated)", postfix)
    )
}
 
library(tidyr)
library(plyr)

plotInfoTheoryResults <- function(infotheory_up, infotheory_down, infotheory_control, i) {
  
  graph_data_up <- infotheory_up %>%
    select(pos, relWeightA, relWeightT, relWeightG, relWeightC) %>%
    gather(nucleotide, relativeWeight, -pos)
  graph_data_up$nucleotide <- mapvalues(graph_data_up$nucleotide,
            from=c("relWeightA","relWeightC","relWeightT", "relWeightG"), 
            to=c("A","C","T","G"))
  
  graph_data_down <- infotheory_down %>%
    select(pos, relWeightA, relWeightT, relWeightG, relWeightC) %>%
    gather(nucleotide, relativeWeight, -pos)
  graph_data_down$nucleotide <- mapvalues(graph_data_down$nucleotide,
            from=c("relWeightA","relWeightC","relWeightT", "relWeightG"), 
            to=c("A","C","T","G"))
  
  graph_data_control <- infotheory_control %>%
    select(pos, relWeightA, relWeightT, relWeightG, relWeightC) %>%
    gather(nucleotide, relativeWeight, -pos)
  graph_data_control$nucleotide <- mapvalues(graph_data_control$nucleotide,
            from=c("relWeightA","relWeightC","relWeightT", "relWeightG"), 
            to=c("A","C","T","G"))
  
  a_data_combined <- getWeightsOfNucleotide(graph_data_up, graph_data_down, 
                                                graph_data_control, "A")
  t_data_combined <- getWeightsOfNucleotide(graph_data_up, graph_data_down, 
                                                graph_data_control, "T")
  c_data_combined <- getWeightsOfNucleotide(graph_data_up, graph_data_down, 
                                                graph_data_control, "C")
  g_data_combined <- getWeightsOfNucleotide(graph_data_up, graph_data_down, 
                                                graph_data_control, "G")
  
  a <- plotResult(a_data_combined, "A", i)
  t <- plotResult(a_data_combined, "T", i)
  c <- plotResult(a_data_combined, "C", i)
  g <- plotResult(a_data_combined, "G", i)
  
  return(list(a,t,c,g))
}

res <- list()
for (i in 1:5) {
  res[i] <- list(plotInfoTheoryResults(infotheory_up[[i]], infotheory_down[[i]], infotheory_control[[i]], paste("Run:", i)))
}

library(gridExtra)
nts <- c("A","T","C","G")
for (j in 1:4) {
  grid.arrange(res[[1]][[j]] + labs(x="", y="", title="", subtitle="") + theme(legend.position="none"),
               res[[2]][[j]] + labs(x="", y="", title="", subtitle="") + theme(legend.position="none"), 
               res[[3]][[j]] + labs(x="", y="", title="", subtitle="") + theme(legend.position="none"), 
               res[[4]][[j]] + labs(x="", y="", title="", subtitle="") + theme(legend.position="none"), 
               res[[5]][[j]] + labs(x="", y="", title="", subtitle="") + theme(legend.position="none"), 
               ncol=2, nrow=3, top=paste("Periodicity of",nts[j],"in D.E. Andreev genes, 5 random sample runs"))
}

Finally, we graph the aggregation of the runs, to show general patterns across the 5 samplings absent of noise.

info_up_agg = data.frame(aaply(laply(infotheory_up, as.matrix), c(2, 3), mean))
info_down_agg = data.frame(aaply(laply(infotheory_down, as.matrix), c(2, 3), mean))
info_control_agg = data.frame(aaply(laply(infotheory_control, as.matrix), c(2, 3), mean))

plotInfoTheoryResults(info_up_agg, info_down_agg, info_control_agg, "")
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

Future Controls:

dev_ingolia, grab ingolia upregulated + downregulated under stress from SQL DB, then rerun this analysis on this. Also E. coli.